library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(modelr)
library(viridis)
## Loading required package: viridisLite
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(latex2exp)
# Load the combined analysis table (one row per paper/vectorizer/center).
# Assigned with `<-` to match every other assignment in this script.
df <- read_csv("../../analysis_data/all_data.csv")
## Rows: 81428 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): librarian, vectorizer, center, fields_of_study_0
## dbl (3): density, edginess, citations_per_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows to sanity-check the parsed columns.
df %>% head()
## # A tibble: 6 × 7
##   density edginess citations_per_year librarian vectorizer center        field…¹
##     <dbl>    <dbl>              <dbl> <chr>     <chr>      <chr>         <chr>  
## 1    623.    0.721               8.86 S2        GPT2       hafenLowreds… Physics
## 2   1784.    0.528              59.7  S2        GPT2       hafenLowreds… Physics
## 3   1768.    0.591              20.4  S2        GPT2       hafenLowreds… Physics
## 4   1409.    0.487               1    S2        GPT2       hafenLowreds… Physics
## 5   1858.    0.552              14.2  S2        GPT2       hafenLowreds… Physics
## 6   1486.    0.435              11.3  S2        GPT2       hafenLowreds… Physics
## # … with abbreviated variable name ¹​fields_of_study_0
# TEMP: Filter first to frequent values; this is an unsatisfying band-aid for BOW.
# df <- df %>% group_by(density) %>% filter(n() >= 50)
# In fact, it prevents viewing the other interesting ones. We might simply be
# unable to facet automatically unless our data is already heavily transformed
# in a specific way for BOW/Word2Vec vectorizers vs. neural LM vectors.


# Standardize density and citations per year within each
# (vectorizer, center) group so that groups on different scales are comparable.
#
# scale() returns a one-column matrix; as.numeric() flattens it so the new
# columns are plain numeric vectors. Matrix columns make downstream filter()
# conditions matrix-valued, which dplyr deprecated in 1.1.0.
#
# The result is intentionally left grouped by (vectorizer, center).
df_grouped_z <- df %>%
  group_by(vectorizer, center) %>%
  mutate(
    density_z = as.numeric(scale(density)),
    cpy_z = as.numeric(scale(citations_per_year))
  )


# Trim rows using the per-group z-scores:
#   * density:            keep |z| <= 2 (within two standard deviations)
#   * citations per year: keep -2 <= z <= 0
# The upper bound of 0 (the group mean) for cpy_z is deliberate: it might be
# the only way to see everything, but it might also exclude things. This may
# eventually need to move to a config, with the plots tweaked to match.
df_grouped_zf <- df_grouped_z %>%
  filter(
    abs(density_z) <= 2,
    # Alternative cut on the raw values:
    # density <= median(df_grouped_z$density)
    cpy_z >= -2,
    cpy_z <= 0
  )
# Faceted overview: filled 2-D density contours of citations per year against
# density, with a loess trend line and the raw points overlaid.
#
# x is the per-group z-scored density; y keeps the original citations-per-year
# values (swap in `density` or `cpy_z` to change either axis).
ggplot(
  df_grouped_zf,
  mapping = aes(x = density_z, y = citations_per_year)
) +
  geom_density_2d_filled(contour_var = "ndensity") +
  scale_fill_viridis(option = "viridis", discrete = TRUE) +
  # The x aesthetic is the z-score, so the axis label says so.
  xlab("Density z-scaled") +
  ylab("Citations per year") +
  # `linewidth` replaces `size` for line geoms (deprecated in ggplot2 3.4.0).
  geom_smooth(color = "orange", linewidth = 2, method = "loess", span = .3) +
  geom_point(alpha = 0.05, color = "white", size = 1) +
  theme(axis.title = element_text(size = 18)) +
  # IMPORTANT: facet by the same variables used for the z-score grouping.
  # This should make the distribution no longer bimodal.
  facet_grid(vectorizer ~ center)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## `geom_smooth()` using formula = 'y ~ x'

# Single-group view: restrict to one (vectorizer, center) combination.
df_test <- df_grouped_zf %>%
  filter(
    vectorizer == "SciBERT",
    center == "hafenLowredshiftLymanLimit2017"
  )

# Rows were selected by z-score upstream, but the axes plot the original
# values (use `density_z` / `cpy_z` for the standardized versions).
ggplot(df_test, mapping = aes(x = density, y = citations_per_year)) +
  geom_density_2d_filled(contour_var = "ndensity") +
  scale_fill_viridis(option = "viridis", discrete = TRUE) +
  xlab("Density") +
  ylab("Citations per year") +
  # `linewidth` replaces `size` for line geoms (deprecated in ggplot2 3.4.0).
  geom_smooth(color = "orange", linewidth = 2, method = "loess", span = .3) +
  geom_point(alpha = 0.05, color = "white", size = 1) +
  theme(axis.title = element_text(size = 18))
## `geom_smooth()` using formula = 'y ~ x'

# Single-group view: BOW vectorizer for the same center.
df_test <- df_grouped_zf %>%
  filter(
    vectorizer == "BOW",
    center == "hafenLowredshiftLymanLimit2017"
  )

# Rows were selected by z-score upstream, but the axes plot the original
# values (use `density_z` / `cpy_z` for the standardized versions).
ggplot(df_test, mapping = aes(x = density, y = citations_per_year)) +
  geom_density_2d_filled(contour_var = "ndensity") +
  scale_fill_viridis(option = "viridis", discrete = TRUE) +
  xlab("Density") +
  ylab("Citations per year") +
  # `linewidth` replaces `size` for line geoms (deprecated in ggplot2 3.4.0).
  geom_smooth(color = "orange", linewidth = 2, method = "loess", span = .3) +
  geom_point(alpha = 0.05, color = "white", size = 1) +
  theme(axis.title = element_text(size = 18))
## `geom_smooth()` using formula = 'y ~ x'

# Single-group view: Word2Vec vectorizer for the same center.
df_test <- df_grouped_zf %>%
  filter(
    vectorizer == "Word2Vec",
    center == "hafenLowredshiftLymanLimit2017"
  )

# Rows were selected by z-score upstream, but the axes plot the original
# values (use `density_z` / `cpy_z` for the standardized versions).
ggplot(df_test, mapping = aes(x = density, y = citations_per_year)) +
  geom_density_2d_filled(contour_var = "ndensity") +
  scale_fill_viridis(option = "viridis", discrete = TRUE) +
  xlab("Density") +
  ylab("Citations per year") +
  # `linewidth` replaces `size` for line geoms (deprecated in ggplot2 3.4.0).
  geom_smooth(color = "orange", linewidth = 2, method = "loess", span = .3) +
  geom_point(alpha = 0.05, color = "white", size = 1) +
  theme(axis.title = element_text(size = 18))
## `geom_smooth()` using formula = 'y ~ x'